This notebook will begin looking at clustering methods on the expression of the genes in a single sample of the dataset of interest, from an unbiased approach.
# Load libraries
library(magrittr)
library(scater)
library(readr)
library(bluster)
library(ggpubr)
library(pheatmap)
# Set file paths
data_dir <- file.path("results", "Gawad_processed_data")
# Source custom functions script
source(file.path("utils", "clustering-functions.R"))
sample_290_normalized <- read_rds(
file.path(data_dir, "SCPCS000216", "SCPCL000290_miQC_processed_sce.rds"))
# Perform k-means clustering
sample_290_normalized <- kmeans_clustering(
sample_290_normalized,
params_range = c(4:10),
step_size = 1
)
# grab column names with clustering results
kmeans_cols <- grep("kmeans", colnames(colData(sample_290_normalized)))
kmeans_cluster_names <- colnames(colData(sample_290_normalized)[, kmeans_cols])
# Plot k-means
kmeans_plot_list <- kmeans_cluster_names %>%
purrr::map(~ plotReducedDim(sample_290_normalized, dimred = "UMAP", colour_by = .x) +
theme_bw()+
theme(text = element_text(size = 22)))
cowplot::plot_grid(plotlist = kmeans_plot_list, ncol = 4)
# Perform graph-based walktrap clustering
sample_290_normalized <- graph_clustering(
sample_290_normalized,
params_range = c(5:25),
step_size = 5,
cluster_type = "walktrap"
)
# grab column names with clustering results
walktrap_cols <- grep("walktrap", colnames(colData(sample_290_normalized)))
walktrap_cluster_names <- colnames(colData(sample_290_normalized)[, walktrap_cols])
# Plot
walktrap_plot_list <- walktrap_cluster_names %>%
purrr::map(~ plotReducedDim(sample_290_normalized, dimred = "UMAP", colour_by = .x) +
theme_bw() +
theme(text = element_text(size = 22)))
cowplot::plot_grid(plotlist = walktrap_plot_list, ncol = 3)
# Perform graph-based louvain clustering
sample_290_normalized <- graph_clustering(
sample_290_normalized,
params_range = c(5:25),
step_size = 5,
cluster_type = "louvain"
)
# grab column names with clustering results
louvain_cols <- grep("walktrap", colnames(colData(sample_290_normalized)))
louvain_cluster_names <- colnames(colData(sample_290_normalized)[, louvain_cols])
# Plot
louvain_plot_list <- louvain_cluster_names %>%
purrr::map(~ plotReducedDim(sample_290_normalized, dimred = "UMAP", colour_by = .x) +
theme_bw() +
theme(text = element_text(size = 22)))
cowplot::plot_grid(plotlist = louvain_plot_list, ncol = 3)
# Check the k-means cluster validity stats for each of the clusters and return
# stats in a data frame
kmeans_stats_df <- create_metadata_stats_df(sample_290_normalized, c(4:10), 1, "kmeans")
Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"
# Preview the results
head(kmeans_stats_df)
# Summarize the stats and return in a data frame
kmeans_summary_stats_df <- summarize_clustering_stats(kmeans_stats_df)
`summarise()` has grouped output by 'cluster_names_column', 'cluster_type'. You can override using the `.groups` argument.
# Preview the summary results
head(kmeans_summary_stats_df)
# Plot individual cluster purity stats
kmeans_purity_plots <- plot_cluster_purity(kmeans_stats_df)
kmeans_purity_plots
# Plot individual cluster silhouette width stats
kmeans_silhouette_plots <- plot_cluster_silhouette_width(kmeans_stats_df)
kmeans_silhouette_plots
# Check the walktrap cluster validity stats for each of the clusters and return
# stats in a data frame
walktrap_stats_df <- create_metadata_stats_df(sample_290_normalized, c(5:25), 5, "walktrap")
Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"
# Preview the all stats results
head(walktrap_stats_df)
# Summarize the stats and return in a data frame
walktrap_summary_stats_df <- summarize_clustering_stats(walktrap_stats_df)
`summarise()` has grouped output by 'cluster_names_column', 'cluster_type'. You can override using the `.groups` argument.
# Preview the summary results
head(walktrap_summary_stats_df)
# Plot individual cluster purity stats
walktrap_purity_plots <- plot_cluster_purity(walktrap_stats_df)
walktrap_purity_plots
# Plot individual cluster silhouette width stats
walktrap_silhouette_plots <- plot_cluster_silhouette_width(walktrap_stats_df)
walktrap_silhouette_plots
# Check the louvain cluster validity stats for each of the clusters and return
# stats in a data frame
louvain_stats_df <- create_metadata_stats_df(sample_290_normalized, c(5:25), 5, "louvain")
Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"Joining, by = "cell_barcode"
# Preview the results
head(louvain_stats_df)
# Summarize the stats and return in a data frame
louvain_summary_stats_df <- summarize_clustering_stats(louvain_stats_df)
`summarise()` has grouped output by 'cluster_names_column', 'cluster_type'. You can override using the `.groups` argument.
# Preview the summary results
head(louvain_summary_stats_df)
# Plot individual cluster purity stats
louvain_purity_plots <- plot_cluster_purity(louvain_stats_df)
louvain_purity_plots
# Plot individual cluster silhouette width stats
louvain_silhouette_plots <- plot_cluster_silhouette_width(louvain_stats_df)
louvain_silhouette_plots
summary_stats_df_list <- list("walktrap" = walktrap_summary_stats_df,
"louvain" = louvain_summary_stats_df)
# purity summary plot
plot_avg_validity_stats(summary_stats_df_list, "avg_purity")
#silhouette width summary plot
plot_avg_validity_stats(summary_stats_df_list, "avg_width")
# Check cluster stability
kmeans_ari_df <- get_cluster_stability_summary(sample_290_normalized, c(4:10), 1, "kmeans")
kmeans_ari_df
# plot cluster stability ARI values
plot_cluster_stability_ari(kmeans_ari_df)
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?
walktrap_ari_df <- get_cluster_stability_summary(sample_290_normalized, c(5:25), 5, cluster_type = "walktrap")
walktrap_ari_df
# plot cluster stability ARI values
plot_cluster_stability_ari(walktrap_ari_df)
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?
louvain_ari_df <- get_cluster_stability_summary(sample_290_normalized, c(5:25), 5, cluster_type = "louvain")
louvain_ari_df
# plot cluster stability ARI values
plot_cluster_stability_ari(louvain_ari_df)
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?
summary_ari_df_list <- list("walktrap" = walktrap_ari_df,
"louvain" = louvain_ari_df)
# plot ARI summary plot
plot_summary_cluster_stability_ari(summary_ari_df_list)
sessionInfo()
R version 4.1.1 (2021-08-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats4 stats graphics grDevices datasets utils methods base
other attached packages:
[1] pheatmap_1.0.12 ggpubr_0.4.0 bluster_1.4.0
[4] readr_2.1.1 scater_1.22.0 ggplot2_3.3.6
[7] scuttle_1.4.0 SingleCellExperiment_1.16.0 SummarizedExperiment_1.24.0
[10] Biobase_2.54.0 GenomicRanges_1.46.1 GenomeInfoDb_1.30.0
[13] IRanges_2.28.0 S4Vectors_0.32.3 BiocGenerics_0.40.0
[16] MatrixGenerics_1.6.0 matrixStats_0.61.0 magrittr_2.0.3
loaded via a namespace (and not attached):
[1] bitops_1.0-7 RColorBrewer_1.1-3 tools_4.1.1 backports_1.4.0
[5] utf8_1.2.2 R6_2.5.1 irlba_2.3.3 vipor_0.4.5
[9] colorspace_2.0-3 nnet_7.3-17 withr_2.5.0 tidyselect_1.1.2
[13] gridExtra_2.3 compiler_4.1.1 cli_3.3.0 BiocNeighbors_1.12.0
[17] DelayedArray_0.20.0 labeling_0.4.2 scales_1.2.0 digest_0.6.29
[21] rmarkdown_2.14 XVector_0.34.0 pkgconfig_2.0.3 htmltools_0.5.2
[25] sparseMatrixStats_1.6.0 fastmap_1.1.0 rlang_1.0.2 rstudioapi_0.13
[29] DelayedMatrixStats_1.16.0 farver_2.1.0 generics_0.1.2 jsonlite_1.8.0
[33] BiocParallel_1.28.2 dplyr_1.0.9 car_3.0-12 RCurl_1.98-1.5
[37] BiocSingular_1.10.0 modeltools_0.2-23 GenomeInfoDbData_1.2.7 Matrix_1.4-1
[41] Rcpp_1.0.7 ggbeeswarm_0.6.0 munsell_0.5.0 fansi_1.0.3
[45] abind_1.4-5 viridis_0.6.2 lifecycle_1.0.1 yaml_2.3.5
[49] carData_3.0-4 MASS_7.3-57 zlibbioc_1.40.0 flexmix_2.3-17
[53] grid_4.1.1 parallel_4.1.1 ggrepel_0.9.1 crayon_1.5.1
[57] lattice_0.20-45 splines_4.1.1 cowplot_1.1.1 beachmat_2.10.0
[61] hms_1.1.1 knitr_1.39 pillar_1.7.0 igraph_1.2.9
[65] ggsignif_0.6.3 ScaledMatrix_1.2.0 magic_1.6-0 pdfCluster_1.0-3
[69] glue_1.6.2 evaluate_0.15 renv_0.14.0 BiocManager_1.30.16
[73] tweenr_1.0.2 vctrs_0.4.1 tzdb_0.2.0 polyclip_1.10-0
[77] gtable_0.3.0 purrr_0.3.4 tidyr_1.2.0 ggforce_0.3.3
[81] xfun_0.31 rsvd_1.0.5 broom_0.7.10 miQC_1.2.0
[85] rstatix_0.7.0 viridisLite_0.4.0 geometry_0.4.6 tibble_3.1.7
[89] tinytex_0.38 beeswarm_0.4.0 cluster_2.1.3 ellipsis_0.3.2